################################################################################
# Code for the analysis of the Monte Carlos
# Estimating Onsets of Binary Events in Panel Data
# Liam F. McGrath
#
# Purpose: This code analyses the results of the monte carlos conducted.
#          The results of the monte carlos are generated by mc_run.R
#          and are contained in collected_props.csv in the output sub folder.
#          Also required is the file scenarios.csv which contains information
#          on each scenario considered in the monte carlos.
#          This file generates figures 1-5 and table 3 in the main text, plus
#          figures 7-9 in the appendix.
#          

rm(list = ls())

library(ggthemes)
library(splines)
library(xtable)

# IMPORTANT:
# Define the path which all the code, data and results are located within
# From this changes in working directory are formed by pasting this
# string with a string for the relevant subfolder.
# If using this code be sure to change to this to your own directory
dir <- "/Users/liammcgrath/Documents/Academic/Research/transition/replication_materials/"

setwd(paste0(dir, "code/"))

# Load in function for combine plots
source("multiplot.R")

## Load in results of monte carlos and info on scenarios
setwd(paste0(dir, "output/"))

newdata <- read.csv("collected_props.csv")
scenarios <- read.csv("scenarios.csv")

# Add some scenario info to the MC results
newdata$d <- scenarios$d
newdata$a <- scenarios$a


# Information on number of observations' outcomes coded to 0
newdata$prop_y_recode <- (newdata$mean_y - newdata$mean_y_trans) / newdata$mean_y
newdata$prop_n_recode <- newdata$mean_y - newdata$mean_y_trans

# Create bias results of estimate for each model

# First as we are focusing on bias relative to beta
# only look at MCs where beta does not equal zero (can't divide by zero)
newdata <- newdata[newdata$b != 0,]

newdata$bias_model1 <- (newdata$mean_b_model1 - newdata$b) / (newdata$b)
newdata$bias_model2 <- (newdata$mean_b_model2 - newdata$b) / (newdata$b)
newdata$bias_model3 <- (newdata$mean_b_model3 - newdata$b) / (newdata$b)
newdata$bias_model4 <- (newdata$mean_b_model4 - newdata$b) / (newdata$b)


# Define the categories of y used in figure 5
newdata$y_category <- NA
newdata$y_category[newdata$mean_y <=0.05] <- "0.00 < y <= 0.05"
newdata$y_category[newdata$mean_y > 0.05 & newdata$mean_y <=0.1] <- "0.05 < y <= 0.10"
newdata$y_category[newdata$mean_y > 0.10 & newdata$mean_y <=0.15] <- "0.10 < y <= 0.15"
newdata$y_category[newdata$mean_y > 0.15 & newdata$mean_y <=0.2] <- "0.15 < y <= 0.20"
newdata$y_category[newdata$mean_y > 0.2 & newdata$mean_y <= 0.25] <- "0.20 < y <= 0.25 "


#######
# ANALYSIS OF MONTE CARLO RESULTS
#######


## For the results in the paper only look at scenarios where
## the proportion of y = 1 in the sample is less than or equal to 0.25
## Results for the full sample occur later on.

newdata_r <- newdata[newdata$mean_y <= 0.25,] # "data restricted"

# First change data format, so that can use facets argument in ggplot

long <- data.frame(model = c(rep("Model 1", nrow(newdata_r)), 
                             rep("Model 2", nrow(newdata_r)), 
                             rep("Model 3", nrow(newdata_r)),
                             rep("Model 4", nrow(newdata_r))
                            )
                   )

long$bias <- c(newdata_r$bias_model1, newdata_r$bias_model2, newdata_r$bias_model3,
               newdata_r$bias_model4)
long$ci_prop <- c(newdata_r$ci_prop_model_1, newdata_r$ci_prop_model_2, 
                  newdata_r$ci_prop_model_3, newdata_r$ci_prop_model4)
long$prop_y_recode <- rep(newdata_r$prop_y_recode, 4)
long$prop_n_recode <- rep(newdata_r$prop_n_recode, 4)
long$rmse <- c(newdata_r$rmse_b_model1, newdata_r$rmse_b_model2, newdata_r$rmse_b_model3,
               newdata_r$rmse_b_model4) /
  rep(abs(newdata_r$b), 4) # relative to beta coefficient
long$b <- rep(newdata_r$b, 4)
long$g <- rep(newdata_r$g, 4)
long$a <- rep(newdata_r$a, 4)
long$d <- rep(newdata_r$d, 4)
long$mean_y <- rep(newdata_r$mean_y, 4)
long$mean_y_trans <-rep(newdata_r$mean_y_trans, 4)
long$y_category <- rep(newdata_r$y_category, 4)


# save(long, file = "long_results.RData")

####
# Table 3: Summarising the overal biases
#####


bias_output <- data.frame(indic = c("Mean of Bias (%)", "Mean of 95% CI Non-Coverage", 
                                    "Mean of $y$", "Mean of Recoded $y$")
)
bias_output$model1 <- NA
bias_output$model2 <- NA
bias_output$model3 <- NA
bias_output$model4 <- NA

bias_output[1, 2:5] <- tapply(long$bias * 100, long$model, mean)
bias_output[2, 2:5] <- tapply(100 * (long$ci_prop - 0.95) / 0.95, long$model, mean)
bias_output[3, 2:5] <- rep(mean(newdata_r$mean_y) , 4)
bias_output[4, 2:5] <- rep(mean(newdata_r$mean_y_trans) , 4)

# print(xtable(bias_output), include.rownames = FALSE, file = "bias_output_tab.tex")


####################
# FIGURES IN THE MAIN TEXT
#####################

# Create model var with a more descriptive name
long$model_d[long$model == "Model 1"] <- "Model 1: Ongoing set to Zero (a)"
long$model_d[long$model == "Model 2"] <- "Model 2: Ongoing set to Zero (b)"
long$model_d[long$model == "Model 3"] <- "Model 3: First Order Markov"
long$model_d[long$model == "Model 4"] <- "Model 4: Ongoing set to Zero + Lag"

### FIGURE 1: Bias and confidence interval non-coverage as a function of the 
#             proportion of y = 1 in the sample

# Bias
fig1_b <- ggplot(data = long, aes(y = bias * 100, x = mean_y)) +
  geom_point(alpha = I(0.1)) + 
  facet_grid(. ~ model_d) + geom_smooth(colour = "black") + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Proportion of Observations where y = 1 in the Sample") +
  ylab("Bias (%)")

# CI Prop

fig1_ci <- ggplot(data = long, aes(y = 100 * (ci_prop - 0.95) / 0.95, 
                                   x = mean_y)) +
  geom_point(alpha = I(0.1)) + 
  facet_grid(. ~ model_d) + geom_smooth(colour = "black") + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Proportion of Observations where y = 1 in the Sample") +
  ylab("95% Confidence Interval Non-Coverage (%)")

# pdf("fig1.pdf", height = 15, width = 13, family = "Palatino")
# multiplot(fig1_b, fig1_ci)
# dev.off()


### FIGURE 2: Bias and confidence interval non-coverage as a function of the 
#             proportion of observations where ongoing years are set to zero

# Bias
fig2_b <- ggplot(data = long, aes(y = bias * 100, x = prop_n_recode)) +
  geom_point(alpha = I(0.1)) + 
  facet_grid(. ~ model_d) + geom_smooth(colour = "black") + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Proportion of Observations where Ongoing Years Are Set to Zero") +
  ylab("Bias (%)")

# CI Prop

fig2_ci <- ggplot(data = long, aes(y = 100 * (ci_prop - 0.95) / 0.95, 
                                   x = prop_n_recode)) +
  geom_point(alpha = I(0.1)) + 
  facet_grid(. ~ model_d) + geom_smooth(colour = "black") + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Proportion of Observations where Ongoing Years Are Set to Zero") +
  ylab("95% Confidence Interval Non-Coverage (%)")

# 
# pdf("fig2.pdf", height = 15, width = 13, family = "Palatino")
# multiplot(fig2_b, fig2_ci)
# dev.off()

### FIGURE 3: Bias and confidence interval non-coverage as a function of the 
#             proportion of observations where ongoing years are set to zero
#             conditional upon the absolute value of the effect on onsets (beta)


# Bias
fig3_b <- ggplot(data = long, aes(y = bias * 100, x = prop_n_recode,
                                  group = as.factor(abs(b)),
                                  colour = as.factor(abs(b)))) +
  geom_point(alpha = I(0.1)) + 
  facet_grid(. ~ model_d) + 
  scale_colour_grey(name="Effect of x upon \nOnsets \n(Absolute Value)", 
                    breaks=c("0.5", "1", "1.5", "2", "2.5"), 
                    labels=c("0.5", "1", "1.5", "2", "2.5")) + 
  geom_smooth(alpha = I(0.1)) + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Proportion of Observations where Ongoing Years Are Set to Zero") +
  ylab("Bias (%)")

# CI Prop

fig3_ci <- ggplot(data = long, aes(y = 100 * (ci_prop - 0.95) / 0.95, x = prop_n_recode,
                                   group = as.factor(abs(b)),
                                   colour = as.factor(abs(b)))) +
  geom_point(colour = "black", alpha = I(0.1)) + 
  facet_grid(. ~ model_d) + 
  scale_colour_grey(name="Effect of x upon \nOnsets \n(Absolute Value)", 
                    breaks=c("0.5", "1", "1.5", "2", "2.5"), 
                    labels=c("0.5", "1", "1.5", "2", "2.5")) +
  geom_smooth(alpha = I(0.1)) + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Proportion of Observations where Ongoing Years Are Set to Zero") +
  ylab("95% Confidence Interval Non-Coverage (%)")



# pdf("fig3.pdf", height = 15, width = 13, family = "Palatino")
# multiplot(fig3_b, fig3_ci)
# dev.off()

### FIGURE 4: Bias and confidence interval non-coverage as a function of the 
#             proportion of observations where y = 1
#             conditional upon the absolute value of the effect on onsets (beta)

# Bias
fig4_b <- ggplot(data = long, aes(y = bias * 100, x = prop_y_recode,
                                  group = as.factor(abs(b)),
                                  colour = as.factor(abs(b)))) +
  geom_point(alpha = I(0.1)) + 
  facet_grid(. ~ model_d) + 
  scale_colour_grey(name="Effect of x upon \nOnsets \n(Absolute Value)", 
                    breaks=c("0.5", "1", "1.5", "2", "2.5"), 
                    labels=c("0.5", "1", "1.5", "2", "2.5")) + 
  geom_smooth(alpha = I(0.1)) + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + 
  xlab("Number of Observations where Ongoing Years set to Zero as a Proportion of the Number of Observations where y = 1") +
  ylab("Bias (%)")

# CI Prop

fig4_ci <- ggplot(data = long, aes(y = 100 * (ci_prop - 0.95) / 0.95, x = prop_y_recode,
                                   group = as.factor(abs(b)),
                                   colour = as.factor(abs(b)))) +
  geom_point(colour = "black", alpha = I(0.1)) + 
  facet_grid(. ~ model_d) + 
  scale_colour_grey(name="Effect of x upon \nOnsets \n(Absolute Value)", 
                    breaks=c("0.5", "1", "1.5", "2", "2.5"), 
                    labels=c("0.5", "1", "1.5", "2", "2.5")) +
  geom_smooth(alpha = I(0.1)) + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + 
  xlab("Number of Observations where Ongoing Years set to Zero as a Proportion of the Number of Observations where y = 1") +
  ylab("95% Confidence Interval Non-Coverage (%)")

# 
# pdf("fig4.pdf", height = 15, width = 13, family = "Palatino")
# multiplot(fig4_b, fig4_ci)
# dev.off()

### FIGURE 5: Bias and confidence interval non-coverage as a function of the 
#             proportion of observations where y = 1. Small multiple plots
#             where the distribution of y, according to categorisations introduced
#             earlier, partition the data. 
#             Conditional upon the absolute value of the effect on onsets (beta)

# Bias
fig5_b <- ggplot(data = long, aes(y = bias * 100, x = prop_y_recode,
                                  group = as.factor(abs(b)),
                                  colour = as.factor(abs(b)))) +
  geom_point(alpha = I(0.1)) + 
  facet_grid(y_category ~ model_d) + 
  scale_colour_grey(name="Effect of x upon \nOnsets \n(Absolute Value)", 
                    breaks=c("0.5", "1", "1.5", "2", "2.5"), 
                    labels=c("0.5", "1", "1.5", "2", "2.5")) + 
  geom_smooth(alpha = I(0.1)) + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + 
  xlab("Number of Observations where Ongoing Years set to Zero as a Proportion of the Number of Observations where y = 1") +
  ylab("Bias (%)") + 
  theme(legend.position = "none") +
  theme(plot.margin = unit(c(0.5,2,0.5,0.5), "cm")) 


# CI Prop

fig5_ci <- ggplot(data = long, aes(y = 100 * (ci_prop - 0.95) / 0.95, x = prop_y_recode,
                                   group = as.factor(abs(b)),
                                   colour = as.factor(abs(b)))) +
  geom_point(colour = "black", alpha = I(0.1)) + 
  facet_grid(y_category ~ model_d) + 
  scale_colour_grey(name="Effect of x upon \nOnsets \n(Absolute Value)", 
                    breaks=c("0.5", "1", "1.5", "2", "2.5"), 
                    labels=c("0.5", "1", "1.5", "2", "2.5")) +
  geom_smooth(alpha = I(0.1)) + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + 
  xlab("Number of Observations where Ongoing Years set to Zero as a Proportion of the Number of Observations where y = 1") +
  ylab("95% Confidence Interval Non-Coverage (%)") +
  theme(legend.direction = "horizontal", legend.position = "bottom", 
        legend.box = "vertical") +
  theme(plot.margin = unit(c(0.5,2,0.5,0.5), "cm")) 

# pdf("fig5.pdf", height = 15, width = 13, family = "Palatino")
# multiplot(fig5_b, fig5_ci)
# grid.text("Categories of the Dependent Variable Based upon the Proportion of Observations where y = 1" ,
#           x=0.975,y=0.5,rot = 270, 
#           gp=gpar(fontsize=12)) 
# dev.off()


#####
# APPENDIX GRAPHS
#####

# Now focus on results for the whole sample
# Reshaped to use the whole sample rather than y <= 0.25
# (This is for some appendix graphs)

long_full <- data.frame(model = c(rep("Model 1", nrow(newdata)), 
                                  rep("Model 2", nrow(newdata)), 
                                  rep("Model 3", nrow(newdata)),
                                  rep("Model 4", nrow(newdata))
)
)

long_full$model_d[long_full$model == "Model 1"] <- "Model 1: Ongoing set to Zero (a)"
long_full$model_d[long_full$model == "Model 2"] <- "Model 2: Ongoing set to Zero (b)"
long_full$model_d[long_full$model == "Model 3"] <- "Model 3: First Order Markov"
long_full$model_d[long_full$model == "Model 4"] <- "Model 4: Ongoing set to Zero + Lag"

long_full$bias <- c(newdata$bias_model1, newdata$bias_model2, newdata$bias_model3,
                    newdata$bias_model4)
long_full$ci_prop <- c(newdata$ci_prop_model_1, newdata$ci_prop_model_2, 
                       newdata$ci_prop_model_3, newdata$ci_prop_model4)
long_full$prop_y_recode <- rep(newdata$prop_y_recode, 4)
long_full$prop_n_recode <- rep(newdata$prop_n_recode, 4)
long_full$rmse <- c(newdata$rmse_b_model1, newdata$rmse_b_model2, newdata$rmse_b_model3,
                    newdata$rmse_b_model4) /
  rep(abs(newdata$b), 4) # relative to beta coefficient
long_full$b <- rep(newdata$b, 4)
long_full$g <- rep(newdata$g, 4)
long_full$a <- rep(newdata$a, 4)
long_full$d <- rep(newdata$d, 4)
long_full$mean_y <- rep(newdata$mean_y, 4)
long_full$mean_y_trans <-rep(newdata$mean_y_trans, 4)
long_full$y_category <- rep(newdata$y_category, 4)


### Graphs for appendix

# FIGURE 7:
# A1) Replicate fig1 with full experiments


# Bias
figa1_b <- qplot(data = long_full, y = bias * 100, x = mean_y, 
                 facets = . ~ model_d, alpha = I(0.2)) + geom_smooth() + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Proportion of Observations where y = 1 in the Sample") +
  ylab("Bias (%)")

# CI Prop
figa1_ci <- qplot(data = long_full, y = 100 * (ci_prop - 0.95) / 0.95, x = mean_y, 
                  facets = . ~ model_d, alpha = I(0.2)) + geom_smooth() + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Proportion of Observations where y = 1 in the Sample") +
  ylab("95% Confidence Interval Non-Coverage")

# RMSE
figa1_r <- qplot(data = long_full, y = rmse, x = mean_y, 
                 facets = . ~ model_d, alpha = I(0.2)) + geom_smooth() + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Proportion of Observations where y = 1 in the Sample") +
  ylab("Root Mean Squared Error")

# pdf("figa1.pdf", height = 15, width = 13, family = "Palatino")
# multiplot(figa1_b, figa1_ci, figa1_r)
# dev.off()

# Figure 8
# A2: Proportion of n, full sample. replication of figure 2
# Bias
figa2_b <- qplot(data = long_full, y = bias * 100, x = prop_n_recode, 
                 facets = . ~ model_d, alpha = I(0.2)) + geom_smooth() + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Proportion of Observations where Ongoing Years Are Set to Zero") +
  ylab("Bias (%)")

# CI Prop
figa2_ci <- qplot(data = long_full, y = 100 * (ci_prop - 0.95) / 0.95, x = prop_n_recode, 
                  facets = . ~ model_d, alpha = I(0.2)) + geom_smooth() + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Proportion of Observations where Ongoing Years Are Set to Zero") +
  ylab("95% Confidence Interval Non-Coverage")

# RMSE
figa2_r <- qplot(data = long_full, y = rmse, x = prop_n_recode, 
                 facets = . ~ model_d, alpha = I(0.2)) + geom_smooth() + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Proportion of Observations where Ongoing Years Are Set to Zero") +
  ylab("Root Mean Squared Error")

# pdf("figa2.pdf", height = 15, width = 13, family = "Palatino")
# multiplot(figa2_b, figa2_ci, figa2_r)
# dev.off()

# Figure 9: 
# A3: proportion of y recoded, full sample. Replication of figure 4 with full sample. 

# Bias
figa3_b <- qplot(data = long_full, y = bias * 100, x = prop_y_recode, 
                 facets = . ~ model_d, alpha = I(0.2)) + geom_smooth() + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Number of Observations where Ongoing Years set to Zero as a Proportion of the Number of Observations where y = 1") +
  ylab("Bias (%)")

# CI Prop
figa3_ci <- qplot(data = long_full, y = 100 * (ci_prop - 0.95) / 0.95, x = prop_y_recode, 
                  facets = . ~ model_d, alpha = I(0.2)) + geom_smooth() + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Number of Observations where Ongoing Years set to Zero as a Proportion of the Number of Observations where y = 1") +
  ylab("95% Confidence Interval Non-Coverage")

# RMSE
figa3_r <- qplot(data = long_full, y = rmse, x = prop_y_recode, 
                 facets = . ~ model_d, alpha = I(0.2)) + geom_smooth() + 
  geom_hline(yintercept = 0, linetype = "dashed") + 
  theme_bw() + xlab("Number of Observations where Ongoing Years set to Zero as a Proportion of the Number of Observations where y = 1") +
  ylab("Root Mean Squared Error")
# 
# pdf("figa3.pdf", height = 15, width = 13, family = "Palatino")
# multiplot(figa3_b, figa3_ci, figa3_r)
# dev.off()
